Study 1: Direct Wish et al., 1976 Replication

Study Information

25 relationships, rated on 25 dimensions (bipolar scales)

Data preparation and cleaning

In [1]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

#from matplotlib.collections import LineCollection
from sklearn import manifold, preprocessing
#from sklearn.metrics import euclidean_distances
from sklearn.decomposition import PCA

Import qualtrics data

Create dataframes for the raw data, raw data with reorganized headers, and a dataframe for just the responses

In [2]:
work_dir = os.path.expanduser('~/Google Drive/olson_lab/projects/relationship_knowledge/surveys/wish_replication/')
os.chdir(work_dir)

output_dir = os.path.expanduser('~/Google Drive/olson_lab/projects/relationship_knowledge/results/wish_replication/')
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Grab the qualtrics output
qual_output = [i for i in os.listdir('./') if i.startswith('RK')][0]

# Prep responses and key
raw = pd.read_csv(qual_output)
raw_reorg = raw.drop(axis='index',index=[0,1])
raw_reorg['subject'] = list(raw_reorg.index)
raw_reorg = raw_reorg.reset_index(drop=True)
raw_reorg['Duration (in seconds)'] = pd.to_numeric(raw_reorg['Duration (in seconds)'])
key = pd.read_csv('qualtrics_question_key.csv')

responses = raw_reorg
responses['subject'] = list(raw_reorg.index)
responses = responses.loc[:, responses.columns.str.startswith('Q')]
responses = responses[responses.columns[1:-7]]

# Get total number of responses
print("Total number of responses from qualtrics - "+str(len(responses)))
Total number of responses from qualtrics - 721

Sample Demographics

In [3]:
# Demographics
demo_orig = pd.read_csv(qual_output, header=1, 
                           usecols=range(len(raw_reorg.iloc[0])-9,
                                         len(raw_reorg.iloc[0])-1))
demo_orig = demo_orig.drop(axis='index',index=0)
demographics = demo_orig
#demographics = demographics.drop(axis='index',index=0)


# Plot demographic information
# Set up the matplotlib figure
#sns.set(font_scale=1.2)
#sns.set_style("whitegrid")
f, axes = plt.subplots(4, 1, figsize=(10, 10), sharex=False)
f.subplots_adjust(wspace = 10, hspace=1.5)
sns.despine(left=True)
sns.countplot(demographics['Age'], ax=axes[0], order=['18 - 24', '25 - 34',
                                                      '35 - 44', '45 - 54',
                                                      '55 - 64', '65 - 74',
                                                      '75 - 84'])
sns.countplot(demographics['Sex'], ax=axes[1])
sns.countplot(demographics['Race'], ax=axes[2])
sns.countplot(demographics['Highest Level of Education'], ax=axes[3],
              order=['Less than high school', 'High school graduate',
                     'Some college', '2 year degree', '4 year degree',
                     'Professional degree','Doctorate'])

demographics['Sex'].describe(include="all")
Out[3]:
count      699
unique       3
top       Male
freq       349
Name: Sex, dtype: object

Dimension Dictionaries

Create a dictionary that will store dataframes for each dimensions
Dataframes have ratings for all relationships

In [4]:
relationships = [x for x in raw.iloc[0].tolist() if x.startswith('Q8.1')]
relationships = [s.replace('\xe2\x80\x93', '-') for s in relationships]
relationships = [x[7:] for x in relationships]
dimension_frames = {}
count=0
for dim in key['dimension'].tolist()[:-2]:  # exclude foil and demographics
    filtered_cols = [col for col in raw_reorg if col.startswith('Q'+str(key['block'].iloc[count])+'.1')]
    dimension_frames[dim] = pd.DataFrame(responses[filtered_cols])
    dimension_frames[dim].columns = relationships
    # Delete all data for "between cousins" and only keep "second cousins"
    dimension_frames[dim]['Between second cousins'].iloc[:629] = np.nan
    count=count+1

Attention Check Quesions

Foil questions were used to catch participants who were not paying attention or taking the survey seriously. Check total number of responses for each word before and after removing bad participants.

In [5]:
num_response = []
for col in dimension_frames['Active vs Inactive'].columns:
    num_response.append(dimension_frames['Active vs Inactive'][col].count())
plt.figure()
sns.barplot(x=key['dimension'].iloc[:25],y=num_response)


# Check foils to see if any subjs responded indiscriminately
foil_relationships = [x for x in raw.iloc[0].tolist() if x.startswith('Q37')]
foil_relationships = [x[6:] for x in foil_relationships]
filtered_cols = [col for col in raw_reorg if col.startswith('Q37')]
foils = pd.DataFrame(responses[filtered_cols])
foils.columns = foil_relationships
foils['MTurkCode']  =raw_reorg['MTurkCode']
foils = foils.apply(pd.to_numeric)
foils_melt = pd.melt(foils, id_vars='MTurkCode')

# Check for values above means for unexpected foils
outliers_list = []
for rel in foils.columns[:-1]:
    outliers_list = outliers_list + list(foils[(foils[rel] > 
                   foils[rel].mean()+foils[rel].std()*2) |
                   (foils[rel] < 
                    foils[rel].mean()-foils[rel].std()*2)].index)
outliers_list = set(outliers_list)



# Remove outliers based on assumptions of words
# Frequency of deathbed > car
# Removing subjects based on completetion time is not the best (they do okay)
foil_outliers = foils
foil_outliers = foil_outliers[~foil_outliers.index.isin(outliers_list)]
foil_outliers_melt = pd.melt(foil_outliers, id_vars='MTurkCode')


f, axes = plt.subplots(2, 1, figsize=(10, 7), sharex=False)
sns.despine(left=True)
sns.stripplot(x="variable", y="value", data=foils_melt, 
              color='.3', jitter=True, ax=axes[0])
sns.stripplot(x="variable", y="value", data=foil_outliers_melt, 
              color='.3', jitter=True,  ax=axes[1])
sns.boxplot(x="variable", y="value", data=foils_melt, whis=np.inf, ax=axes[0])
sns.boxplot(x="variable", y="value", data=foil_outliers_melt, whis=np.inf, ax=axes[1])
plt.legend(bbox_to_anchor=(1, 1), loc=2, borderaxespad=0.0)


# Remove outliers from the rest of the data
dimension_frames_outliers = dimension_frames.copy()
for dim in dimension_frames_outliers:
    dimension_frames_outliers[dim] = dimension_frames[dim][~dimension_frames[dim].index.isin(outliers_list)]
    dimension_frames_outliers[dim] = dimension_frames_outliers[dim].apply(pd.to_numeric)
    
    
# Check total number of responses for each word with outliers excluded
num_response = []
for col in dimension_frames_outliers['Active vs Inactive'].columns:
    num_response.append(dimension_frames_outliers['Active vs Inactive'][col].count())
plt.figure()
sns.barplot(x=key['dimension'].iloc[:25],y=num_response)

print("Total number of responses, excluding bad participants - "+str(len(dimension_frames_outliers['Active vs Inactive'])))
No handles with labels found to put in legend.
Total number of responses, excluding bad participants - 592

Dimension Relationship Ratings

Create a matrix of the average ratings of relationship for each dimension

In [6]:
# Create a dataframe that has the average response for each relationship, on each dimension
dim_rel = pd.DataFrame(columns=relationships)
count=0
for dim in dimension_frames_outliers.keys():
    dim_rel.loc[count] = dimension_frames_outliers[dim].mean().tolist()
    count = count + 1
dim_rel.index = dimension_frames_outliers.keys()
dim_rel.to_csv(output_dir + 'dim_rel.csv')


plt.figure(figsize=(15,15))
heatmap = sns.heatmap(dim_rel, center=50,cmap="RdBu_r")
fig = heatmap.get_figure()
fig.savefig(output_dir+'dim_rel_heatmap.png')

# Scale the matrix. This is typically in PCA so that the dimensions are normed.
dim_rel_scaled = preprocessing.scale(dim_rel.transpose())
dim_rel_scaled_df = pd.DataFrame(dim_rel_scaled, index=dim_rel.columns,
                                 columns=dim_rel.index)

dim_rel_scaled_df
dim_rel_scaled_df.to_csv(output_dir+'/dim_rel_scaled.csv')

Principal Component Analysis

We will complete the PCA analysis in R. Various PCA packages will be used to take advantage of their specific strengths

In [1]:
if(!require(FactoMineR)) install.packages("FactoMineR"); require(FactoMineR)
if(!require(factoextra)) install.packages("factoextra"); require(factoextra)
library(psych)
library(ggplot2)
if(!require(reshape)) install.packages("reshape"); require(reshape)
library(cowplot)
library(stringr)
if(!require(plotly)) install.packages("plotly"); require(plotly)


setwd("C:/Users/Administrator/Google_Drive/olson_lab/projects/relationship_knowledge/results/wish_replication/")
#setwd("~/Google_Drive/olson_lab/projects/relationship_knowledge/results/wish_replication/")
getwd()
Loading required package: FactoMineR

Loading required package: factoextra

Loading required package: ggplot2

Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa


Attaching package: 'psych'


The following objects are masked from 'package:ggplot2':

    %+%, alpha


Loading required package: reshape


Attaching package: 'cowplot'


The following object is masked from 'package:reshape':

    stamp


Loading required package: plotly


Attaching package: 'plotly'


The following object is masked from 'package:reshape':

    rename


The following object is masked from 'package:ggplot2':

    last_plot


The following object is masked from 'package:stats':

    filter


The following object is masked from 'package:graphics':

    layout


'C:/Users/Administrator/Google_Drive/olson_lab/projects/relationship_knowledge/results/wish_replication'

Quantitatively selecting the number of components

PCA will output the same number of components as there are dimension inputs. As the components are ranked by how much variance they explain, we can exclude some components which do not add much additional information.

Parallel Analysis

We weill use parallel analysis to indicate what the optimal number of components to include would be.

In [2]:
# import that relationship dimension ratings from the python output
dim_rel_scaled = read.csv('dim_rel_scaled.csv', row.names=1)
rownames(dim_rel_scaled) <- str_replace(rownames(dim_rel_scaled), "â\200“", "-")

png(filename="pca_results/parallel_analysis.png")
fa.parallel(dim_rel_scaled,fa="pc")
dev.off()
fa.parallel(dim_rel_scaled,fa="pc")
Warning message in cor.smooth(R):
"Matrix was not positive definite, smoothing was done"
Warning message in cor.smooth(R):
"Matrix was not positive definite, smoothing was done"
Warning message in cor.smooth(R):
"Matrix was not positive definite, smoothing was done"
Warning message in cor.smooth(r):
"Matrix was not positive definite, smoothing was done"
Warning message in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
"The estimated weights for the factor scores are probably incorrect.  Try a different factor score estimation method."
In factor.scores, the correlation matrix is singular, an approximation is used

Warning message in cor.smooth(r):
"Matrix was not positive definite, smoothing was done"
Parallel analysis suggests that the number of factors =  NA  and the number of components =  2 
png: 2
Warning message in cor.smooth(R):
"Matrix was not positive definite, smoothing was done"
Warning message in cor.smooth(R):
"Matrix was not positive definite, smoothing was done"
Warning message in cor.smooth(R):
"Matrix was not positive definite, smoothing was done"
Warning message in cor.smooth(r):
"Matrix was not positive definite, smoothing was done"
Warning message in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
"The estimated weights for the factor scores are probably incorrect.  Try a different factor score estimation method."
In factor.scores, the correlation matrix is singular, an approximation is used

Warning message in cor.smooth(r):
"Matrix was not positive definite, smoothing was done"
Parallel analysis suggests that the number of factors =  NA  and the number of components =  2 

Parallel analysis indicates that having only 2 components would be optimal. But to better match the literature, and to be consistent with possible future analyses, we will include 4 components.

PCA with no rotation

In [3]:
# Run PCA with no rotation
res.pca_nor <- PCA(dim_rel_scaled, ncp=4, graph=FALSE)

write.csv(res.pca_nor$var$coord, file = 'pca_results/pca_loadings_nrotate.csv')
write.csv(res.pca_nor$ind$coord, file = 'pca_results/pca_relationships_nrotate.csv')

# Create screeplot
png(filename="pca_results/scree_plot.png")
fviz_screeplot(res.pca_nor) + ggtitle("") +
theme(text = element_text(size = 20))
dev.off()
fviz_screeplot(res.pca_nor) + ggtitle("") +
theme(text = element_text(size = 20))
png: 2

PCA with varimax rotation

Rotations are used in principal component analyses to be able to better interpret the data. There are two main types of rotations, varimax and oblimin. Here, we will use varimax rotation, as it will maximize the component loadings so that dimensions are more strongly loaded onto a single component, rather than across components. Because of this, our resulting components may correlate with each other. Oblimin rotation results in components that are uncorrelated to each other.

In [4]:
pv <- principal(dim_rel_scaled, 4, rotate="varimax")

pv$loadings <- pv$loadings[ order(row.names(pv$loadings)), ][ , order(colnames(pv$loadings)) ]
write.csv(pv$loadings, file = 'pca_results/pca_loadings_varimax.csv')
write.csv(pv$scores, file = 'pca_results/pca_relationships_varimax.csv')

print(paste0("First four components account for ", 
             round(pv$Vaccounted[3,4], digits = 4)*100,
            "% of the variance"))
Warning message in cor.smooth(r):
"Matrix was not positive definite, smoothing was done"
Warning message in principal(dim_rel_scaled, 4, rotate = "varimax"):
"The matrix is not positive semi-definite, scores found from Structure loadings"
[1] "First four components account for 94.36% of the variance"

Component Loadings

In [5]:
loadings_var <- cbind(melt(as.matrix(as.data.frame(pv$loadings[,1:4]))), "Varimax")
colnames(loadings_var)[4] <- 'Rotation'

ggplot(data = loadings_var, aes(x = X2, y = X1)) +
    geom_tile(aes(fill = value)) + 
    scale_fill_distiller(palette="RdBu", limits=c(-1,1)) +
    ggtitle("Varimax PCA Loadings") +
    ylab("Wish Dimensions") +
    theme_minimal() +
    theme(text = element_text(size = 12), axis.title.x=element_blank()) 
ggsave('pca_results/pca_loadings_varimax.png')
Saving 6.67 x 6.67 in image

In [6]:
comp1_loadings <- loadings_var[loadings_var$X2=='RC1',]
print(cat('Component 1 highest positive loadings:',
            gsub('\\.', ' ', toString(comp1_loadings[comp1_loadings$value > 0.5,]$X1))))
print(cat('\nComponent 1 highest negative loadings:',
            gsub('\\.', ' ', toString(comp1_loadings[comp1_loadings$value < -0.5,]$X1))))

comp2_loadings <- loadings_var[loadings_var$X2=='RC2',]
print(cat('\nComponent 2 highest positive loadings:',
            gsub('\\.', ' ', toString(comp2_loadings[comp2_loadings$value > 0.5,]$X1))))
print(cat('\nComponent 2 highest negative loadings:',
            gsub('\\.', ' ', toString(comp2_loadings[comp2_loadings$value < -0.5,]$X1))))

comp3_loadings <- loadings_var[loadings_var$X2=='RC3',]
print(cat('\nComponent 3 highest positive loadings:',
            gsub('\\.', ' ', toString(comp3_loadings[comp3_loadings$value > 0.5,]$X1))))
print(cat('\nComponent 3 highest negative loadings:',
            gsub('\\.', ' ', toString(comp3_loadings[comp3_loadings$value < -0.5,]$X1))))

comp4_loadings <- loadings_var[loadings_var$X2=='RC4',]
print(cat('\nComponent 4 highest positive loadings:',
            gsub('\\.', ' ', toString(comp4_loadings[comp4_loadings$value > 0.5,]$X1))))
print(cat('\nComponent 4 highest negative loadings:',
            gsub('\\.', ' ', toString(comp4_loadings[comp4_loadings$value < -0.5,]$X1))))
Component 1 highest positive loadings: Altruistic vs Selfish, Compatible vs incompatible goals and desires, Cooperative vs Competitive, Democratic vs Autocratic, Easy vs Difficult to resolve conflicts with each other, Emotionally close vs distant, Fair vs Unfair, Flexible vs Rigid, Friendly vs Hostile, Harmonious vs Clashing, Important vs Unimportant to individuals involved, Important vs Unimportant to society, Productive vs Destructive, Relaxed vs Tense, Sincere vs InsincereNULL

Component 1 highest negative loadings: NULL

Component 2 highest positive loadings: Difficult vs Easy to break off contact with each other, Emotional vs Intellectual, Emotionally close vs distant, Informal vs Formal, Intense vs Superficial feelings toward each other, Intense vs Superficial interaction with each other, Interesting vs Dull, Pleasure vs Work orientedNULL

Component 2 highest negative loadings: NULL

Component 3 highest positive loadings: Democratic vs Autocratic, Equal vs Unequal, Similar vs Different roles and behaviorNULL

Component 3 highest negative loadings: NULL

Component 4 highest positive loadings: Active vs Inactive, Important vs Unimportant to individuals involved, Important vs Unimportant to society, Intense vs Superficial interaction with each other, Interesting vs DullNULL

Component 4 highest negative loadings: NULL

Comparison to Wish Dimensions

Wish et al., 1976 - Table 2

In [7]:
library(dplyr)

# Import dimension loadings from Wish et al., 1976
wish_dim1 = c(.89, .87, .87, .87, .85, .84, .76, .75, .73, .02, .11, 
             .12, .10, -.03, .12, .16, -.03, .32, -.13, .40, .42, .47, 
             .55, .59, .20)
wish_dim2 = c(.04, .04, .04, .06, .04, .05, .04, .22, .07, .96, .91, 
             .07, .10, .02, .12, .06, .06, .05, -.01, .63, .02, .04,
             .10, .14, -.01)
wish_dim3 = c(-.01, .04, .08, .10, .16, .03, .07, .07, .00, .02, .09, 
             .81, .75, .73, .64, .53, -.08, .14, .22, .00, .64, .36, 
             .25, .00, .53)
wish_dim4 = c(-.10, -.06, .12, .01, -.06, .04, .24, .13, .29, .03, .02,
             -.06, .18, .22, .15, .01, .95, .70, .70, .27, .08, .42, 
             .40, .43, .43)
wish_loadings <- data.frame(wish_dim1, wish_dim2, wish_dim3, wish_dim4)
wish_loadings$dims <- c('Harmonious.vs.Clashing', 'Cooperative.vs.Competitive', 'Friendly.vs.Hostile',
  'Compatible.vs.incompatible.goals.and.desires', 'Productive.vs.Destructive',
  'Easy.vs.Difficult.to.resolve.conflicts.with.each.other','Altruistic.vs.Selfish',
  'Fair.vs.Unfair', 'Relaxed.vs.Tense', 'Equal.vs.Unequal','Similar.vs.Different.roles.and.behavior',
  'Active.vs.Inactive','Intense.vs.Superficial.interaction.with.each.other',
  'Intense.vs.Superficial.feelings.toward.each.other','Interesting.vs.Dull',
  'Important.vs.Unimportant.to.society','Pleasure.vs.Work.oriented','Informal.vs.Formal',
  'Emotional.vs.Intellectual','Democratic.vs.Autocratic','Important.vs.Unimportant.to.individuals.involved',
  'Emotionally.close.vs.distant','Sincere.vs.Insincere','Flexible.vs.Rigid',
  'Difficult.vs.Easy.to.break.off.contact.with.each.other')

# Rearrange rows so that they match the PCA loadings dataframe
wish_loadings <- wish_loadings %>%
  mutate(dims =  factor(dims, levels = rownames(pv$loadings[,1:4]))) %>%
  arrange(dims)

rownames(wish_loadings) <- wish_loadings$dims
wish_loadings <- wish_loadings[,1:4]
wish_loadings
write.csv(wish_loadings, file = 'pca_results/wish_loadings.csv')
Attaching package: 'dplyr'


The following object is masked from 'package:reshape':

    rename


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


A data.frame: 25 × 4
wish_dim1wish_dim2wish_dim3wish_dim4
<dbl><dbl><dbl><dbl>
Active.vs.Inactive 0.12 0.07 0.81-0.06
Altruistic.vs.Selfish 0.76 0.04 0.07 0.24
Compatible.vs.incompatible.goals.and.desires 0.87 0.06 0.10 0.01
Cooperative.vs.Competitive 0.87 0.04 0.04-0.06
Democratic.vs.Autocratic 0.40 0.63 0.00 0.27
Difficult.vs.Easy.to.break.off.contact.with.each.other 0.20-0.01 0.53 0.43
Easy.vs.Difficult.to.resolve.conflicts.with.each.other 0.84 0.05 0.03 0.04
Emotional.vs.Intellectual-0.13-0.01 0.22 0.70
Emotionally.close.vs.distant 0.47 0.04 0.36 0.42
Equal.vs.Unequal 0.02 0.96 0.02 0.03
Fair.vs.Unfair 0.75 0.22 0.07 0.13
Flexible.vs.Rigid 0.59 0.14 0.00 0.43
Friendly.vs.Hostile 0.87 0.04 0.08 0.12
Harmonious.vs.Clashing 0.89 0.04-0.01-0.10
Important.vs.Unimportant.to.individuals.involved 0.42 0.02 0.64 0.08
Important.vs.Unimportant.to.society 0.16 0.06 0.53 0.01
Informal.vs.Formal 0.32 0.05 0.14 0.70
Intense.vs.Superficial.feelings.toward.each.other-0.03 0.02 0.73 0.22
Intense.vs.Superficial.interaction.with.each.other 0.10 0.10 0.75 0.18
Interesting.vs.Dull 0.12 0.12 0.64 0.15
Pleasure.vs.Work.oriented-0.03 0.06-0.08 0.95
Productive.vs.Destructive 0.85 0.04 0.16-0.06
Relaxed.vs.Tense 0.73 0.07 0.00 0.29
Similar.vs.Different.roles.and.behavior 0.11 0.91 0.09 0.02
Sincere.vs.Insincere 0.55 0.10 0.25 0.40
In [8]:
# Merge together PCA and wish loadings dataframes
loadings_all_temp <- merge(pv$loadings[,1:4], wish_loadings, by="row.names")
loadings_all <- loadings_all_temp[,-1]  # Reset the index to be relationships
rownames(loadings_all) <- loadings_all_temp[,1]
colnames(loadings_all) <- c('RC1','RC2','RC3','RC4',
                              'Dim1','Dim2','Dim3','Dim4')

# Calculate correlation matrix of all dimensions across all methods
loadings_all.cor = cor(loadings_all, method=c("spearman"))

# Create heatmap plot 
loadings_all.cor_melt <- melt(loadings_all.cor)

limit <- max(abs(loadings_all.cor_melt$value)) * c(-1, 1)
ggplot(data = loadings_all.cor_melt, aes(x = X2, y = X1)) +
    geom_tile(aes(fill = value)) + 
    scale_fill_distiller(palette="RdBu", limit=limit) + 
    ggtitle('Principal Component & Wish Dimension Loadings Comparison') +
    theme(text = element_text(size = 8), 
          plot.title = element_text(color="black", size=14, face="bold", hjust = 0.5),
          axis.title.x = element_blank(),
          axis.title.y = element_blank())
In [9]:
loading_cor_output <- cor.test(pv$loadings[,4], wish_loadings$wish_dim3, 
                               method = "spearman")
loading_cor_output$estimate
loading_cor_output$p.value
Warning message in cor.test.default(pv$loadings[, 4], wish_loadings$wish_dim3, method = "spearman"):
"Cannot compute exact p-value with ties"
rho: 0.809937188753296
9.32490847173975e-07
In [10]:
#max(loadings_all.cor[5:8,1])
colMax <- function(loadings_all.cor) sapply(loadings_all.cor, max, na.rm = TRUE)
colMax
function (loadings_all.cor) 
sapply(loadings_all.cor, max, na.rm = TRUE)
In [11]:
pv$loadings[,1:4]
A matrix: 25 × 4 of type dbl
RC1RC2RC3RC4
Active.vs.Inactive 0.18687206 0.152738825 0.14468324 0.866603840
Altruistic.vs.Selfish 0.86146402 0.368038536 0.04168189 0.286096769
Compatible.vs.incompatible.goals.and.desires 0.91208207 0.035619487 0.18978870 0.285562670
Cooperative.vs.Competitive 0.95378052 0.150443925-0.10419346 0.114001972
Democratic.vs.Autocratic 0.66201402 0.217627777 0.68292941 0.088538474
Difficult.vs.Easy.to.break.off.contact.with.each.other 0.25961945 0.790372813-0.04613186 0.361798123
Easy.vs.Difficult.to.resolve.conflicts.with.each.other 0.97636818 0.079764948 0.09780439 0.093597421
Emotional.vs.Intellectual-0.06052608 0.965400663 0.07006948-0.049572205
Emotionally.close.vs.distant 0.59926305 0.699999012 0.19350853 0.304903633
Equal.vs.Unequal 0.17769489 0.253989355 0.93674788-0.003832917
Fair.vs.Unfair 0.87408309 0.172206907 0.31591522 0.267145158
Flexible.vs.Rigid 0.76097194 0.416135321 0.46118052 0.046678360
Friendly.vs.Hostile 0.96297553 0.153367311 0.13743462 0.140145833
Harmonious.vs.Clashing 0.97574765-0.022443310 0.10216636-0.004251832
Important.vs.Unimportant.to.individuals.involved 0.51068340 0.397505649 0.02614887 0.713245435
Important.vs.Unimportant.to.society 0.50295093-0.002776676-0.27325614 0.690731176
Informal.vs.Formal 0.20289159 0.882248259 0.35106697-0.047161744
Intense.vs.Superficial.feelings.toward.each.other 0.05940895 0.917079860 0.16934938 0.320412760
Intense.vs.Superficial.interaction.with.each.other 0.07017843 0.787766017 0.13369363 0.551030174
Interesting.vs.Dull 0.10184915 0.588442555 0.41377009 0.624191518
Pleasure.vs.Work.oriented 0.36125491 0.833366049 0.35155911-0.070385807
Productive.vs.Destructive 0.92797072-0.084064182 0.03342386 0.334417762
Relaxed.vs.Tense 0.88306285 0.337572570 0.26764718 0.043876046
Similar.vs.Different.roles.and.behavior 0.09043396 0.202985504 0.94302713 0.101417831
Sincere.vs.Insincere 0.79301723 0.463866496 0.08130511 0.356769567

Relationship Plots

In [12]:
ggplot(as.data.frame(pv$scores), aes(x=RC1, y=RC2)) +
  geom_point(size=0) + 
  geom_text(label=rownames(pv$scores), size=3) +
  theme(text = element_text(size = 12)) +
  xlim(-25, 25)
ggsave('pca_results/pca_relationships_varimax_rc1rc2.png')

ggplot(as.data.frame(pv$scores), aes(x=RC3, y=RC4)) +
  geom_point(size=0) + 
  geom_text(label=rownames(pv$scores), size=3) +
  theme(text = element_text(size = 12)) +
  xlim(-7, 6)
ggsave('pca_results/pca_relationships_varimax_rc3rc4.png')
Saving 6.67 x 6.67 in image

Warning message:
"Removed 5 rows containing missing values (geom_point)."
Warning message:
"Removed 5 rows containing missing values (geom_text)."
Saving 6.67 x 6.67 in image

Warning message:
"Removed 5 rows containing missing values (geom_point)."
Warning message:
"Removed 5 rows containing missing values (geom_text)."
In [13]:
fig <- plot_ly(as.data.frame(pv$scores), x=~RC1, y=~RC2,
             type = "scatter", mode = "markers",
             text = rownames(pv$scores),
             hoverinfo = "text")
x <- list(title = paste('PC1 (', round(res.pca_nor$eig[1,2], 2), '%)'))
y <- list(title = paste('PC2 (', round(res.pca_nor$eig[2,2], 2), '%)'))

fig <- fig %>% layout(xaxis = x, yaxis = y)
htmlwidgets::saveWidget(as_widget(fig), "index.html")
Warning message:
"`arrange_()` is deprecated as of dplyr 0.7.0.
Please use `arrange()` instead.
See vignette('programming') for more help
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated."
In [15]:
fig <- plot_ly(as.data.frame(pv$scores), x=~RC3, y=~RC4,
             type = "scatter", mode = "markers",
             text = rownames(pv$scores),
             hoverinfo = "text")
x <- list(title = paste('PC3 (', round(res.pca_nor$eig[3,2], 2), '%)'))
y <- list(title = paste('PC4 (', round(res.pca_nor$eig[4,2], 2), '%)'))

fig <- fig %>% layout(xaxis = x, yaxis = y)
fig
#embed_notebook(fig, height="500")
In [ ]:
pv$scores

Summary

PC1 = Valence
PC2 = Formality
PC3 = Equality
PC4 = Activeness

In [ ]:
library(gridExtra)

# Component loadings
fixed_comp_labels <- str_wrap(gsub('[.]', ' ', 
                                   lapply(loadings_var$X1[1:25], 
                                          as.character)), width=30)
comp_loadings <- ggplot(data = loadings_var, aes(x = X2, y = X1)) +
    geom_tile(aes(fill = value)) + 
    scale_x_discrete(labels = c('PC1', 'PC2', 'PC3', 'PC4')) + 
    scale_y_discrete(labels = fixed_comp_labels) + 
    scale_fill_distiller(palette="RdBu", limits=c(-1,1)) +
    ggtitle("Varimax PCA Loadings") +
    ylab("Wish Dimensions") +
    theme_minimal() +
    theme(text = element_text(size = 16), axis.title.x=element_blank()) 

# Relationship scatter plots
scatter_rc1_rc2 <- ggplot(as.data.frame(pv$scores), aes(x=RC1, y=RC2)) +
                      geom_point(size=1) + 
                      geom_text(label=rownames(pv$scores), 
                                check_overlap=TRUE) +
                      theme(text = element_text(size = 16),
                            panel.grid.major = element_blank(), 
                            panel.grid.minor = element_blank(), 
                            panel.background = element_blank(), 
                            axis.line = element_line(colour = "white")) +
                      geom_hline(aes(yintercept=0), color="grey") + 
                      geom_vline(aes(xintercept=0), color="grey") +
                      xlab(paste('PC1 (', round(res.pca_nor$eig[1,2], 2), '%)')) +
                      ylab(paste('PC2 (', round(res.pca_nor$eig[2,2], 2), '%)')) +
                      xlim(-25, 30)

scatter_rc3_rc4 <- ggplot(as.data.frame(pv$scores), aes(x=RC3, y=RC4)) +
                      geom_point(size=1) + 
                      geom_text(label=rownames(pv$scores),
                               check_overlap=TRUE) +
                      theme(text = element_text(size = 16),
                            panel.grid.major = element_blank(), 
                            panel.grid.minor = element_blank(), 
                            panel.background = element_blank(), 
                            axis.line = element_line(colour = "white")) +
                      geom_hline(aes(yintercept=0), color="grey") + 
                      geom_vline(aes(xintercept=0), color="grey") +
                      xlab(paste('PC3 (', round(res.pca_nor$eig[3,2], 2), '%)')) +
                      ylab(paste('PC4 (', round(res.pca_nor$eig[4,2], 2), '%)')) +
                      xlim(-15, 12)

# create blank plot for merging scatter and density plots
  blankPlot <- ggplot()+geom_blank(aes(1,1))+
    theme(plot.background = element_blank(), panel.grid.major = element_blank(),panel.grid.minor = element_blank(), 
          panel.border = element_blank(),panel.background = element_blank(),axis.title.x = element_blank(),
          axis.title.y = element_blank(),axis.text.x = element_blank(), axis.text.y = element_blank(), 
          axis.ticks = element_blank())
  
# Organize plots into one figure
lay <- rbind(c(1,1,2,2),
             c(1,1,2,2),
             c(1,1,3,3),
             c(1,1,3,3))
options(repr.plot.width=20, repr.plot.height=16)
g <- grid.arrange(comp_loadings, scatter_rc1_rc2, scatter_rc3_rc4, layout_matrix = lay)

ggsave('pca_results/summary.png', g, width=12, height=12, dpi=300)

Compare results for no rotation, varimax, and oblimin

Run oblimin rotation

In [ ]:
po <- principal(dim_rel_scaled, 4, rotate="oblimin")
write.csv(po$loadings, file = 'pca_results/pca_loadings_oblimin.csv')
write.csv(po$scores, file = 'pca_results/pca_relationships_oblimin.csv')

Compare loading plots

In [ ]:
#test <- melt(pv$loadings)
#library(reshape)
loadings_nor <- cbind(melt(as.matrix(as.data.frame(res.pca_nor$var$coord))), "No Rotation")
colnames(loadings_nor)[4] <- 'Rotation'
loadings_obl <- cbind(melt(as.matrix(as.data.frame(po$loadings[,1:4]))), "Oblimin")
colnames(loadings_obl)[4] <- 'Rotation'

loadings_all <- rbind(loadings_nor, loadings_var, loadings_obl)

ggplot(data = loadings_all, aes(x = X2, y = X1)) +
    geom_tile(aes(fill = value)) + 
    scale_fill_distiller(palette="RdBu") + 
    facet_wrap(~Rotation,scales="free_x") +
    theme(text = element_text(size = 8))

Compare relationship plots

In [ ]:
plot1 <- ggplot(as.data.frame(res.pca_nor$ind$coord), aes(x=Dim.1, y=Dim.2)) +
  geom_point() + 
  geom_text(label=rownames(res.pca_nor$ind$coord), size=2)
plot2 <- ggplot(as.data.frame(res.pca_nor$ind$coord), aes(x=Dim.3, y=Dim.4)) +
  geom_point() + 
  geom_text(label=rownames(res.pca_nor$ind$coord), size=2)

plot3 <- ggplot(as.data.frame(pv$scores), aes(x=RC1, y=RC2)) +
  geom_point() + 
  geom_text(label=rownames(pv$scores), size=2)
plot4 <- ggplot(as.data.frame(pv$scores), aes(x=RC3, y=RC4)) +
  geom_point() + 
  geom_text(label=rownames(pv$scores), size=2)

plot5 <- ggplot(as.data.frame(po$scores), aes(x=TC1, y=TC2)) +
  geom_point() + 
  geom_text(label=rownames(pv$scores), size=2)
plot6 <- ggplot(as.data.frame(po$scores), aes(x=TC3, y=TC4)) +
  geom_point() + 
  geom_text(label=rownames(pv$scores), size=2)

plot_grid(plot1, plot2, plot3, plot4, plot5, plot6, ncol=2, 
          labels=c('A', 'B','C','D','E','F'))

#options(warn=-1)

Multidimensional Scaling (MDS)

Classical MDS

Classical MDS is very similar to PCA with no rotations.

In [ ]:
dist_mtx <- dist(dim_rel_scaled)

# Run MDS with 4 dimensions to be returned
fit_cl_mds <- cmdscale(dist_mtx, eig=TRUE, k=4)
colnames(fit_cl_mds$points) <- c('Dim1', 'Dim2', 'Dim3', 'Dim4')

Relationship Plots

In [ ]:
# Plot relationships
ggplot(as.data.frame(fit_cl_mds$points), aes(x=Dim1, y=Dim2)) +
  geom_point(size=0) + 
  geom_text(label=rownames(fit_cl_mds$points), size=3) +
  theme(text = element_text(size = 12)) +
  xlim(-5, 7)
ggplot(as.data.frame(fit_cl_mds$points), aes(x=Dim3, y=Dim4)) +
  geom_point(size=0) + 
  geom_text(label=rownames(fit_cl_mds$points), size=3) +
  theme(text = element_text(size = 12)) +
  xlim(-5, 4)

Dimension 1 = Valence
Dimension 2 = Formality
Dimension 3 = Activeness?
Dimension 4 = ?

Nonmetric MDS

In [ ]:
library(MASS)

# Run MDS with 4 dimensions to be returned
fit_nm_mds <- isoMDS(dist_mtx, k=4)
colnames(fit_nm_mds$points) <- c('Dim1', 'Dim2', 'Dim3', 'Dim4')

Relationship Plots

In [ ]:
# Plot relationships
ggplot(as.data.frame(fit_nm_mds$points), aes(x=Dim1, y=Dim2)) +
  geom_point(size=0) + 
  geom_text(label=rownames(fit_nm_mds$points), size=3) +
  theme(text = element_text(size = 12)) +
  xlim(-5, 7)
ggplot(as.data.frame(fit_nm_mds$points), aes(x=Dim3, y=Dim4)) +
  geom_point(size=0) + 
  geom_text(label=rownames(fit_nm_mds$points), size=3) +
  theme(text = element_text(size = 12)) +
  xlim(-5, 4)

Dimension 1 = Valence
Dimension 2 = Formality
Dimension 3 = Equality?
Dimension 4 = Activeness?

Compare PCA, MDS, Nonmetric MDS

Here we will compare the results from the PCA with varimax rotation, classical MDS, and nonmetric MDS analyses.

Relationship Plots

In [ ]:
plot1 <- ggplot(as.data.frame(pv$scores), aes(x=RC1, y=RC2)) +
  geom_point() + 
  geom_text(label=rownames(pv$scores), size=2)
plot2 <- ggplot(as.data.frame(pv$scores), aes(x=RC3, y=RC4)) +
  geom_point() + 
  geom_text(label=rownames(pv$scores), size=2)

plot3 <- ggplot(as.data.frame(fit_cl_mds$points), aes(x=Dim1, y=Dim2)) +
  geom_point() + 
  geom_text(label=rownames(fit_cl_mds$points), size=2)
plot4 <- ggplot(as.data.frame(fit_cl_mds$points), aes(x=Dim3, y=Dim4)) +
  geom_point() + 
  geom_text(label=rownames(fit_cl_mds$points), size=2)

plot5 <- ggplot(as.data.frame(fit_nm_mds$points), aes(x=Dim1, y=Dim2)) +
  geom_point() + 
  geom_text(label=rownames(fit_nm_mds$points), size=2)
plot6 <- ggplot(as.data.frame(fit_nm_mds$points), aes(x=Dim3, y=Dim4)) +
  geom_point() + 
  geom_text(label=rownames(fit_nm_mds$points), size=2)

plot_grid(plot1, plot2, plot3, plot4, plot5, plot6, ncol=2, 
          labels=c('A', 'B','C','D','E','F'))

Dimension Correlations

In [ ]:
# Merge together PCA, MDS, and nonmetric MDS dataframes
rel_scores_all_temp <- merge(pv$scores[,1:4], fit_cl_mds$points[,1:4], by="row.names")
rel_scores_all <- rel_scores_all_temp[,-1]  # Reset the index to be relationships
rownames(rel_scores_all) <- rel_scores_all_temp[,1]
#colnames(rel_scores_all) <- c('RC1','RC2','RC3','RC4','Dim1','Dim2','Dim3','Dim4')
rel_scores_all_temp <- merge(rel_scores_all, fit_nm_mds$points[,1:4], by=0)
rel_scores_all <- rel_scores_all_temp[,-1]
rownames(rel_scores_all) <- rel_scores_all_temp[,1]
colnames(rel_scores_all) <- c('RC1','RC2','RC3','RC4',
                              'Dim1','Dim2','Dim3','Dim4',
                              'nDim1','nDim2','nDim3','nDim4')

# Calculate correlation matrix of all dimensions across all methods
rel_scores_all.cor = cor(rel_scores_all, method=c("spearman"))

# Create heatmap plot 
rel_scores_all.cor_melt <- melt(rel_scores_all.cor)

limit <- max(abs(rel_scores_all.cor_melt$value)) * c(-1, 1)
ggplot(data = rel_scores_all.cor_melt, aes(x = X2, y = X1)) +
    geom_tile(aes(fill = value)) + 
    scale_fill_distiller(palette="RdBu", limit=limit) + 
    ggtitle('PCA, MDS, nMDS Comparison') +
    theme(text = element_text(size = 8), 
          plot.title = element_text(color="black", size=14, face="bold", hjust = 0.5),
          axis.title.x = element_blank(),
          axis.title.y = element_blank())
In [ ]:
cor.test(rel_scores_all$nDim3, rel_scores_all$RC3, method = "spearman")

In PCA with varimax rotation, the dimension end up being correlated to each other. In classical and nonmetric MDS (nMDS), dimensions are not allowed to be correlated to each other. The dimensions from MDS and nMDS are very similar. The first MDS dimension is strongly correlated to the four PCA dimensions. MDS dimensions two and three is moderately correlated to PCA dimensions two and three.